library(tidyverse)
library(reshape2)
library(ggeasy)
net_dir = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_net_MF/"
expression_dir = "/pastel/projects/speakeasy_dlpfc/"
work_dir = "/pastel/Github_scripts/SpeakEasy_dlpfc/bulk_dlpfc/"Hub genes are the most connected genes in a module so, here we list the top ones by module.
#############################
# Edges table
#############################
################################## Variables to set
# edge_weight2plot = 0.01 # Filter the edges by this value. This is the fraction of the top edge correlations. 0.1 = top 10%.
edge_weight_1 = 1 # We don't wanna plot the correlations = 1
correlation_method = "pearson"
################################## Done
# Expression data for a single set:
load(paste0(expression_dir, "exprdata_byregion.Rdata"))
expr_matx = as.data.frame(exprData_MF) # Residuals of the expression
gene_modules = read.table(paste0(net_dir, "geneBycluster.txt"), header = T, stringsAsFactors = F) # clusters from SpeakEasy
gene_modules_size = gene_modules %>% group_by(cluster_lv3) %>% summarise(n = n()) %>% filter(n >= 30)
n_modules = nrow(gene_modules_size)
nodes_df = data.frame()
edges_df = data.frame()
for(i in 1:n_modules){
# i=1
mod2plot = gene_modules_size$cluster_lv3[i]
modSize = gene_modules_size$n[i]
# Select the expression values for the module of interest
to_plot = gene_modules$ensembl[gene_modules$cluster_lv3 == mod2plot]
expr_matx_mod = expr_matx[to_plot, ]
#Let's calculate the correlation matrix
matx_cor = cor(t(expr_matx_mod), method = correlation_method) # must be gene by gene
matx_cor_m = melt(matx_cor)
colnames(matx_cor_m) = c("ensembl1", "ensembl2", "value")
matx_cor_temp = matx_cor_m %>% left_join(gene_modules[, c("ensembl", "symbol")], by=c("ensembl1" = "ensembl"))
colnames(matx_cor_temp) = c("ensembl1", "ensembl2", "value", "from_node")
# Complete correlation matrix
matx_cor_temp2 = matx_cor_temp %>% left_join(gene_modules[, c("ensembl", "symbol")], by=c("ensembl2" = "ensembl"))
colnames(matx_cor_temp2) = c("ensembl1", "ensembl2", "value", "from_node", "to_node")
# Filter the edges based on correlation (correlation is used as weight here)
edges_before_filtering = matx_cor_temp2[which(abs(matx_cor_temp2$value) != edge_weight_1), ]
#order_of_edges = order(abs(edges_before_filtering$value), decreasing = T)
#order_of_edges_filt = order_of_edges[1:round(length(order_of_edges)*edge_weight2plot)]
#edges_filtered = edges_before_filtering[order_of_edges_filt,]
# edges2keep = which(edges_before_filtering$value >= (median(abs(edges_before_filtering$value)))) # Get the edges > than the median
edges2keep = which(edges_before_filtering$value >= quantile(abs(edges_before_filtering$value), 3/4)) # Get the edges on the 3/4 quantiles
edges_filtered = edges_before_filtering[edges2keep,]
colnames(edges_filtered) = c("fromNode", "toNode", "weight", "fromAltName", "toAltName") # match names for Cytoscape input
# Remove NAs and duplicated ensembls
#edges_filtered = na.omit(edges_filtered)
#edges_filtered = edges_filtered[! duplicated(edges_filtered), ]
edges_filtered_df = edges_filtered
edges_filtered_df$module = mod2plot
edges_filtered_df$region = "MF"
edges_df = rbind(edges_df, edges_filtered_df) # Filtered edges table with all modules from all cell types
#############################
# Nodes table
#############################
nodes_table = gene_modules[ , c("ensembl", "cluster_lv3", "symbol")]
colnames(nodes_table) = c("nodeName", "nodeAttr", "altName") # Match for Cytoscape input
nodes_table = nodes_table[nodes_table$nodeAttr == mod2plot, ] # Only for the module of interest
# Apply same filter to the nodes table
nodes_filtered = nodes_table[nodes_table$nodeName %in% c(edges_filtered$fromNode,edges_filtered$toNode), ]
# Add number of node connections in the filtered network to get the hub nodes
nodes_filtered = inner_join(nodes_filtered,
bind_rows(edges_filtered %>% group_by(fromNode) %>% dplyr::count() %>% dplyr::rename(nodeName = fromNode),
edges_filtered %>% group_by(toNode) %>% dplyr::count() %>% dplyr::rename(nodeName = toNode)) %>%
group_by(nodeName) %>% distinct() %>% tally(wt = n))
hub_genes_list = nodes_filtered[order(nodes_filtered$n, decreasing = T), ]$altName[1:5] # Hubs are the most connected nodes in a module (after filtering the edges)
nodes_filtered_df = nodes_filtered[order(nodes_filtered$n, decreasing = T),]
nodes_filtered_df$module_size = modSize
nodes_filtered_df$region = "MF"
nodes_df = rbind(nodes_df, nodes_filtered_df) # Filtered nodes table with all modules from all cell types
}
# Top hub genes
nodes_df_top = nodes_df %>%
group_by(region, module_size) %>%
arrange(-n) %>%
slice_head(n = 5)
write.table(nodes_df_top, file = paste0(work_dir, "hub_genes_MF.txt"), sep = "\t", quote = F, row.names = F)
createDT(nodes_df_top)We want to check if the network has a scale-free topology.
Scale-free: many nodes with only a few links. A few hubs with large number of links.
Random: most nodes have the same number of links. No highly connected nodes resulting in a Poison distribution.
ggplot(nodes_df, aes(x=n)) +
geom_histogram(bins = 100, color="black", fill="white") +
easy_labs(x = "Number of links (k)", y = "Number of nodes with k links") +
theme_classic()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggeasy_0.1.3 reshape2_1.4.4 forcats_0.5.1 stringr_1.5.0
## [5] dplyr_1.1.3 purrr_1.0.2 readr_2.1.4 tidyr_1.3.0
## [9] tibble_3.2.1 ggplot2_3.4.4 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.11 lubridate_1.8.0 digest_0.6.33 utf8_1.2.4
## [5] R6_2.5.1 cellranger_1.1.0 plyr_1.8.9 backports_1.4.1
## [9] reprex_2.0.1 evaluate_0.23 highr_0.10 httr_1.4.7
## [13] pillar_1.9.0 rlang_1.1.2 readxl_1.3.1 rstudioapi_0.15.0
## [17] jquerylib_0.1.4 DT_0.30 rmarkdown_2.25 labeling_0.4.3
## [21] htmlwidgets_1.6.2 munsell_0.5.0 broom_1.0.5 compiler_4.1.2
## [25] modelr_0.1.8 xfun_0.41 pkgconfig_2.0.3 htmltools_0.5.7
## [29] tidyselect_1.2.0 fansi_1.0.5 crayon_1.5.2 tzdb_0.4.0
## [33] dbplyr_2.4.0 withr_2.5.2 grid_4.1.2 jsonlite_1.8.7
## [37] gtable_0.3.4 lifecycle_1.0.3 DBI_1.1.3 magrittr_2.0.3
## [41] scales_1.2.1 cli_3.6.1 stringi_1.7.12 cachem_1.0.8
## [45] farver_2.1.1 fs_1.6.3 xml2_1.3.5 bslib_0.5.1
## [49] ellipsis_0.3.2 generics_0.1.3 vctrs_0.6.4 tools_4.1.2
## [53] glue_1.6.2 hms_1.1.3 crosstalk_1.2.0 fastmap_1.1.1
## [57] yaml_2.3.7 colorspace_2.1-0 rvest_1.0.2 knitr_1.45
## [61] haven_2.4.3 sass_0.4.7